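使用go.Scatter()函数可以绘制散点图、气泡图和线形图,常用参数包括:x、y(数据)、mode(绘图模式,可取'markers'、'lines'、'lines+markers')、name(trace的名称)、marker(点的大小、颜色等样式)、line(线条的宽度、颜色等样式)、text和hoverinfo(交互时显示的文本)。示例如下: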
# 3-1 Line and Scatter Plots: one random series per drawing mode
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)  # N evenly spaced points on [0, 1]
# Three standard-normal samples (mean 0, variance 1), offset by +5, 0 and -5.
y0 = np.random.randn(N) + 5
y1 = np.random.randn(N)
y2 = np.random.randn(N) - 5

# Build all traces up front; each trace is named after the mode it shows.
fig = go.Figure(data=[
    go.Scatter(x=x, y=series, mode=mode, name=mode)
    for series, mode in (
        (y0, 'markers'),        # points only
        (y1, 'lines+markers'),  # points joined by a line
        (y2, 'lines'),          # line only
    )
])
fig.show()
# 3-2 Styled Scatter Plots: the same three modes with explicit styling
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)  # N evenly spaced points on [0, 1]
# Three standard-normal samples (mean 0, variance 1), offset by +5, 0 and -5.
y0 = np.random.randn(N) + 5
y1 = np.random.randn(N)
y2 = np.random.randn(N) - 5

fig = go.Figure(data=[
    # Points only: marker size and fill colour.
    go.Scatter(
        x=x, y=y0, mode='markers', name='markers',
        marker={'size': 10, 'color': '#f79337'},
    ),
    # Points joined by a line: markers additionally get an outline.
    go.Scatter(
        x=x, y=y1, mode='lines+markers', name='lines+markers',
        marker={
            'size': 10,
            'color': '#E15759',
            'line': {'width': 1, 'color': 'rgba(0,0,0,0.2)'},  # outline width and colour
        },
    ),
    # Line only: line width and colour.
    go.Scatter(
        x=x, y=y2, mode='lines', name='lines',
        line={'width': 2, 'color': '#76B7B2'},
    ),
])
fig.show()
# 3-3 Simple Bubble Chart: marker size carries a third dimension
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(50, size=N)  # N random integers in [0, 50)

fig = go.Figure()
# Each marker's size is taken from the matching entry of z.
fig.add_trace(go.Scatter(x=x, y=y, mode='markers', marker={'size': z}))
fig.show()
# Extension: bubble sizes can be rescaled with the `sizeref` attribute.
# Suggested formula:
#   sizeref = 2. * max(size array) / (desired marker size ** 2)
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(500, size=N)  # N random integers in [0, 500)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=x, y=y, mode='markers',
    marker={
        'size': z,
        # 'sizemode' is left at its default; sizemode='area' is an optional
        # setting that this example keeps disabled.
        'sizeref': 2. * z.max() / (10 ** 2),  # formula above with a desired size of 10
    },
))
fig.show()
# 3-4 Simple Bubble Chart: marker colour carries a third dimension
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(50, size=N)  # N random integers in [0, 50)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=x, y=y, mode='markers',
    text=z,  # text attached to each point
    marker={
        'size': z,                # bubble size follows z
        'color': z,               # bubble colour follows z as well
        'colorscale': 'Viridis',  # colour range used to map z
        'showscale': True,        # show the colour bar next to the plot
        'opacity': 0.6,           # bubble opacity
    },
))
fig.show()
# 3-5 Simple Bubble Chart: custom hover labels
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(50, size=N)  # N random integers in [0, 50)

# One HTML label per point: x and y rounded to 2 decimals, plus the size.
# Built with a comprehension; the earlier loop body was not indented and
# therefore did not parse.
text = [
    f'x={np.round(xi, 2)}<br>y={np.round(yi, 2)}<br>size={zi}'
    for xi, yi, zi in zip(x, y, z)
]

fig = go.Figure(data=go.Scatter(
    x=x, y=y,
    mode='markers',
    text=text,
    hoverinfo='text',  # show only the custom label on hover
    marker=dict(
        size=z,                # bubble size follows z
        color=z,               # bubble colour follows z
        colorscale='Viridis',  # colour range used to map z
        showscale=True,        # show the colour bar next to the plot
        opacity=0.6,           # bubble opacity
    ),
))
fig.show()
导入数据文件'Sample - Superstore.xls',绘制散点图,展示商品子类别(Sub-Category)中'Paper'销售额(Sales)和利润(Profit)的相关关系,用气泡的颜色来展示Discount的取值大小,从而进一步分析这些变量之间的关系。
# Step 1. Load the 'Orders' sheet of 'Sample - Superstore.xls' and keep
# only the rows whose 'Sub-Category' is 'Paper'.
import pandas as pd

df = pd.read_excel('Sample - Superstore.xls', sheet_name='Orders')
data = df.loc[df['Sub-Category'].eq('Paper')]  # boolean-mask row selection
data.head()
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 13 | CA-2018-114412 | 2018-04-15 | 2018-04-20 | Standard Class | AA-10480 | Andrew Allen | Consumer | United States | Concord | ... | 28027.0 | South | OFF-PA-10002365 | Office Supplies | Paper | Xerox 1967 | 15.552 | 3 | 0.2 | 5.4432 |
| 34 | 35 | CA-2018-107727 | 2018-10-19 | 2018-10-23 | Second Class | MA-17560 | Matt Abelman | Home Office | United States | Houston | ... | 77095.0 | Central | OFF-PA-10000249 | Office Supplies | Paper | Easy-staple paper | 29.472 | 3 | 0.2 | 9.9468 |
| 56 | 57 | CA-2017-111682 | 2017-06-17 | 2017-06-18 | First Class | TB-21055 | Ted Butterfield | Consumer | United States | Troy | ... | 12180.0 | East | OFF-PA-10001569 | Office Supplies | Paper | Xerox 232 | 32.400 | 5 | 0.0 | 15.5520 |
| 58 | 59 | CA-2017-111682 | 2017-06-17 | 2017-06-18 | First Class | TB-21055 | Ted Butterfield | Consumer | United States | Troy | ... | 12180.0 | East | OFF-PA-10000587 | Office Supplies | Paper | Array Parchment Paper, Assorted Colors | 14.560 | 2 | 0.0 | 6.9888 |
| 64 | 65 | CA-2016-135545 | 2016-11-24 | 2016-11-30 | Standard Class | KM-16720 | Kunst Miller | Consumer | United States | Los Angeles | ... | 90004.0 | West | OFF-PA-10003892 | Office Supplies | Paper | Xerox 1943 | 146.730 | 3 | 0.0 | 68.9631 |
5 rows × 21 columns
# Step 2. Scatter plot of Sales vs. Profit for 'Paper', coloured by Discount
import plotly.graph_objects as go

# Build a fresh figure from `data` (the 'Paper' rows selected in Step 1).
# Without it, update_layout() would only restyle whatever `fig` was left
# over from an earlier example and the Paper data would never be drawn.
fig = go.Figure(data=go.Scatter(
    x=data['Sales'],
    y=data['Profit'],
    mode='markers',
    marker=dict(
        color=data['Discount'],           # colour encodes the discount
        colorscale='Viridis',
        showscale=True,                   # show the colour bar
        colorbar=dict(title='Discount'),
        opacity=0.6,
    ),
))
fig.update_layout(
    title='Sales and Profit Distribution of Paper',
    xaxis=dict(title='Sales'),
    yaxis=dict(title='Profit'),
)
fig.show()
# Extension: use colour to distinguish the data points of each Region.
# One trace per region, so every region gets its own colour and legend
# entry; an empty go.Figure() would render a blank plot.
fig = go.Figure(data=[
    go.Scatter(
        x=group['Sales'],
        y=group['Profit'],
        mode='markers',
        name=region,
    )
    for region, group in data.groupby('Region')
])
fig.update_layout(
    title='Sales and Profit Distribution of Paper by Region',
    xaxis=dict(title='Sales'),
    yaxis=dict(title='Profit'),
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制气泡图,展示销售额最高的前50名客户的销售额(X轴)和利润(Y轴)的关系,气泡的大小size和颜色color均体现折扣(Discount)这一变量,交互时增加显示的文本text:Customer Name和Discount(如交互所示)。
# Step 1. Load 'Sample - Superstore.xls' and compute, per customer, total
# Sales, mean Discount and total Profit; keep the 50 customers with the
# highest total Sales.
import pandas as pd

df = pd.read_excel('Sample - Superstore.xls', sheet_name='Orders')
data = (
    df.groupby('Customer Name')
    .agg({'Sales': 'sum', 'Discount': 'mean', 'Profit': 'sum'})
    .sort_values(by='Sales', ascending=False)
    .iloc[:50]  # first 50 rows after sorting by Sales, descending
)
data.head()
| Sales | Discount | Profit | |
|---|---|---|---|
| Customer Name | |||
| Sean Miller | 25043.050 | 0.246667 | -1980.7393 |
| Tamara Chand | 19052.218 | 0.116667 | 8981.3239 |
| Raymond Buch | 15117.339 | 0.094444 | 6976.0959 |
| Tom Ashbrook | 14595.620 | 0.080000 | 4703.7883 |
| Adrian Barton | 14473.571 | 0.240000 | 5444.8055 |
# Step 2. Draw the bubble chart
import plotly.graph_objects as go

# Build a fresh figure from `data` (the top-50 customers from Step 1).
# Without it, update_layout() would only restyle a stale `fig`.
fig = go.Figure(data=go.Scatter(
    x=data['Sales'],
    y=data['Profit'],
    mode='markers',
    # Extra hover text: the customer name (the index of `data`) and discount.
    text=[
        f'Customer Name: {name}<br>Discount: {discount:.2f}'
        for name, discount in zip(data.index, data['Discount'])
    ],
    marker=dict(
        size=data['Discount'],
        # The mean discounts are fractions below 1, so raw values would be
        # drawn as sub-pixel markers; rescale them so the largest bubble is
        # about 40 px across.
        sizemode='area',
        sizeref=2. * data['Discount'].max() / (40 ** 2),
        color=data['Discount'],           # colour also encodes the discount
        colorscale='Viridis',
        showscale=True,                   # show the colour bar
        colorbar=dict(title='Discount'),
        opacity=0.6,
    ),
))
fig.update_layout(
    title="Top-50 Customers' Sales and Profit",
    xaxis_title='Sales',
    yaxis_title='Profit',
)
fig.show()
Plotly对时间序列的支持比较友好,既支持字符串格式,又支持日期/时间格式。只要传入的参数x是datetime.datetime对象,或者字符串strings,Plotly会自动识别为时间格式。
如果只想展示部分时间范围内的绘图结果,可以在布局layout的xaxis中传入range参数来实现。如果想要恢复默认的时间范围,可以单击界面右上角的Autoscale按钮。
# 3-6 Time Series: three ways of supplying dates on the x axis
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import datetime

# (a) datetime.datetime objects
x0 = [datetime.datetime(year=2020, month=10, day=d) for d in (1, 3, 5)]
y0 = [0, 1, 2]

# (b) date strings, here held in a NumPy array
x1 = np.array(['2020-10-01', '2020-10-03', '2020-10-05'])
y1 = [2, 1, 0]

# (c) a pandas date_range: 31 consecutive days starting 2020-10-01
x2 = pd.date_range(start='20201001', periods=31)
y2 = np.random.randn(31)

fig = go.Figure(data=[
    go.Scatter(x=x0, y=y0, name='trace_datetime'),
    go.Scatter(x=x1, y=y1, name='trace_string'),
    go.Scatter(x=x2, y=y2, name='trace_daterange'),
])
fig.update_traces(opacity=0.8)

# Restrict the visible x range to the first 7 days of x2.
# Equivalent explicit form:
#   [datetime.datetime(2020, 10, 1), datetime.datetime(2020, 10, 7)]
fig.update_layout(xaxis_range=[x2[0], x2[6]])
fig.show()
导入数据文件'Sample - Superstore.xls',绘制时间序列图,展示2018年每天的销售额(Sales)和利润(Profit)。
# Load the data and aggregate it
import pandas as pd

df = pd.read_excel('Sample - Superstore.xls', sheet_name='Orders')
# Total Sales and Profit for each order date
data = df.groupby('Order Date')[['Sales', 'Profit']].agg('sum')
data
| Sales | Profit | |
|---|---|---|
| Order Date | ||
| 2015-01-03 | 16.4480 | 5.5512 |
| 2015-01-04 | 288.0600 | -65.9901 |
| 2015-01-05 | 19.5360 | 4.8840 |
| 2015-01-06 | 4407.1000 | 1358.0524 |
| 2015-01-07 | 87.1580 | -71.9621 |
| ... | ... | ... |
| 2018-12-26 | 814.5940 | 61.1202 |
| 2018-12-27 | 177.6360 | -31.9742 |
| 2018-12-28 | 1657.3508 | 253.1188 |
| 2018-12-29 | 2915.5340 | 644.4338 |
| 2018-12-30 | 713.7900 | 101.5365 |
1236 rows × 2 columns
# Indexing and slicing on the date index
# Partial-string lookups: all of 2018, then February 2018 only.
for period in ('2018', '2018-02'):
    print(data.loc[period], '\n')
# Label slice between two dates.
print(data.loc['2018-12-01':'2018-12-15'])
Sales Profit
Order Date
2018-01-01 1481.8280 -181.4109
2018-01-02 2079.5540 -207.0473
2018-01-03 2070.2720 704.2800
2018-01-06 33.7400 15.5204
2018-01-07 3395.5900 758.7192
... ... ...
2018-12-26 814.5940 61.1202
2018-12-27 177.6360 -31.9742
2018-12-28 1657.3508 253.1188
2018-12-29 2915.5340 644.4338
2018-12-30 713.7900 101.5365
[322 rows x 2 columns]
Sales Profit
Order Date
2018-02-02 913.3540 170.6770
2018-02-03 922.3270 215.5700
2018-02-04 32.6700 8.4942
2018-02-05 2263.0120 74.8820
2018-02-06 904.3540 204.3158
2018-02-09 773.7640 -411.9726
2018-02-10 227.1030 28.1274
2018-02-11 1241.5160 130.1018
2018-02-13 1058.4300 424.3345
2018-02-16 1337.4420 95.9756
2018-02-17 2964.8174 -383.5478
2018-02-18 287.3260 62.4082
2018-02-19 1314.5900 377.0515
2018-02-20 1150.2900 -107.5121
2018-02-21 47.9040 -2.9940
2018-02-23 117.8000 42.3700
2018-02-24 1448.6760 249.3929
2018-02-25 430.4920 -19.3798
2018-02-26 2847.6460 447.3532
2018-02-28 17.6200 8.2242
Sales Profit
Order Date
2018-12-01 5331.178 718.8920
2018-12-02 9951.182 -7.3410
2018-12-03 1403.842 280.7407
2018-12-04 2639.638 -21.9881
2018-12-05 1453.136 447.6235
2018-12-06 10.680 2.8836
2018-12-07 2916.514 -2686.6673
2018-12-08 7643.041 1154.6045
2018-12-09 5470.390 1487.1418
2018-12-10 3873.559 715.5696
2018-12-11 2823.965 -82.4089
2018-12-13 580.936 99.2154
2018-12-14 3897.714 215.2500
2018-12-15 306.888 52.5946
# Time-series plot: daily Sales and Profit in 2018
import plotly.graph_objects as go
import pandas as pd

data = df.groupby('Order Date')[['Sales', 'Profit']].sum()
data = data.loc['2018']  # keep only the 2018 rows

# Build a fresh figure with one line per measure. Without it,
# update_layout() would only restyle a stale `fig` from an earlier example.
fig = go.Figure(data=[
    go.Scatter(x=data.index, y=data[column], mode='lines', name=column)
    for column in ('Sales', 'Profit')
])
fig.update_layout(
    title='Sales and Profit in 2018'
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制时间序列图,展示2018年每个月的销售额(Sales)和利润(Profit)。
对Order Date进行groupby操作后,时间戳是每天(D),如果想要将其转换为每月(M),可以通过重新采样来实现。重新采样是指将时间序列从一个频率转换为另一个频率的过程。将更高频率的数据聚合到低频率被称为向下采样,反之则称为向上采样。Pandas对象配有resample方法,与groupby方法类似,调用resample时需要对数据分组,之后再调用聚合函数。
# Resampling: aggregate the 2018 daily totals into monthly totals
data = (
    df.groupby('Order Date')[['Sales', 'Profit']].sum()
    .loc['2018']
    .resample('M')  # month-end frequency
    .sum()
)
data
| Sales | Profit | |
|---|---|---|
| Order Date | ||
| 2018-01-31 | 43971.3740 | 7140.4391 |
| 2018-02-28 | 20301.1334 | 1613.8720 |
| 2018-03-31 | 58872.3528 | 14751.8915 |
| 2018-04-30 | 36521.5361 | 933.2900 |
| 2018-05-31 | 44261.1102 | 6342.5828 |
| 2018-06-30 | 52981.7257 | 8223.3357 |
| 2018-07-31 | 45264.4160 | 6952.6212 |
| 2018-08-31 | 63120.8880 | 9040.9557 |
| 2018-09-30 | 87866.6520 | 10991.5556 |
| 2018-10-31 | 77776.9232 | 9275.2755 |
| 2018-11-30 | 118447.8250 | 9690.1037 |
| 2018-12-31 | 83829.3188 | 8483.3468 |
data.index.strftime(date_format='%Y-%m')  # format each index timestamp as 'YYYY-MM'
Index(['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
'2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12'],
dtype='object', name='Order Date')
# Time-series plot: monthly Sales and Profit in 2018
# Build a fresh figure from the resampled monthly `data`. Without it,
# update_layout() would only restyle a stale `fig`.
# x uses 'YYYY-MM' labels instead of the month-end timestamps in the index,
# so each point sits on its month's tick.
fig = go.Figure(data=[
    go.Scatter(
        x=data.index.strftime('%Y-%m'),
        y=data[column],
        mode='lines+markers',
        name=column,
    )
    for column in ('Sales', 'Profit')
])
fig.update_layout(
    title='Sales and Profit in 2018',
    xaxis=dict(dtick='M1')  # one x-axis tick per month
)
fig.show()